Tutorial_scRNAseq_analysis

Prerequisites

First we need to import Packages required for the Analysis and register multiple Cores for parallel computation.

library(scRNAseq.analysis)
library(Seurat)
 library("tidyr")
library(cowplot)
library(ggplot2)
library(org.Mm.eg.db)
library(clustree)
library(dplyr)
library(plyr)
library(foreach)
library(clusterProfiler)
library(biomaRt)
library(DOSE)
library(grid)
library(gridExtra)
doMC::registerDoMC(future::availableCores()*0.5)

###only half, as not to use up all resources, change accordingly, if more power required
# system.time(fastSave::load.lbzip2(here::here("RImages/SeqWell_myeolid/Exp_191114","full_analysed.RDataFS"),n.cores=100))

###custom color palette for Seurat, number stands for cluster ID without annotation
my_cols <- c('0'='#006A40FF','1'="#F8E5AE",'2'='#75B41EFF','3'='#95828DFF','4'='#708C98FF',
  '5'='#8AB8CFFF','6'='#007E7FFF','7'='#358359FF','8'='#8BA1BCFF','9'='#5A5895FF',
  '10'='#F2990CFF','11'='#5A5895FF','12'='#E5BA3AFF',"13"="chartreuse","14"='#F08892FF',"15"="#E10520","16"="#13EF48","17"="#E08A9D","18"="#E59E3C")

my_cols2 <- my_cols[order(as.integer(names(my_cols)))]
### show the ID of the color 

Then we need to load the Mart datasets to be able to retrieve more information about our genes.


###execute one and save the object 
mouse_ensembl<-biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
human_ensembl<-biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")

Import Data

Then we import the matrices from a DropSeq pipeline output folder:


 #This path is mounted in Singularity ... to reproduce the results, you can access the data using the following mount in Singularity: --bind /groups/NovaSeq-01/bioinformatics/SeqWell/84_Seq_Beyer_spg15:/data/SeqWell/190613 
dir_RNA <- "/data/SeqWell/190613/alignment/2020-07-17/output/results/samples/"

###Lets create a list for Seurat objects for better overview
Seurat_objects<-list()
Seurat_objects$total<-read_mtx(dir=dir_RNA)
#> 
#> Attaching package: 'Matrix'
#> The following object is masked from 'package:S4Vectors':
#> 
#>     expand
#> The following objects are masked from 'package:tidyr':
#> 
#>     expand, pack, unpack

Or import SmartSeq2 Data using import_SS2_kallisto():

##use ensembl for gene annotation (standard output is in ensembleID, thus we use biomart to convert to human readable gene symbols)
mouse_ensembl<-biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")

###same as above: --bind /groups/NovaSeq-01/bioinformatics/RNA/124_2019_SS2_Beyer_CNS_TCells_210414:/data/SS2/190613
Seurat_objects$SS2_190613<-import_SS2_kallisto("/data/SS2/190613/alignment/2021-04-15--17-09-37/output/kallisto",mouse_ensembl)

Metadata

The Metadata in our case is directly contained in the Cell name, thus we can just extract it:


meta<-separate(as.data.frame(colnames(Seurat_objects$total)),col = "colnames(Seurat_objects$total)",sep = "_",into=c(NA,"Genotype","Mouse",NA,"Pool"),remove = F)
rownames(meta)<-meta$`colnames(Seurat_objects$total)`
meta<-meta[,-1]
Seurat_objects$total<-AddMetaData(Seurat_objects$total,meta)

We can furthermore add the Percentage of mitochondrial genes:



###lets first find genes, which are located on the mitochondrial chromosome: 
mito.genes<-biomaRt::getBM(attributes=c('external_gene_name', 'transcript_biotype',"chromosome_name"),
                           filters = 'chromosome_name',
                           values = "MT",
                           mart = mouse_ensembl)
##you need to subset first the genes which are actually present in the seurat object
mito.genes<-mito.genes$external_gene_name[mito.genes$external_gene_name%in%rownames(Seurat_objects$total)]
##add percentage of this genes
Seurat_objects$total<-MetaFeature(Seurat_objects$total,features = as.character(mito.genes),meta.name = "percent.mito")

QC

Alignment Quality

First check the alignment quality:



##multiqc statistics
path_191114<-"/data/SeqWell/191114/alignment/2020-03-13/output/results"



align_stats(path_191114,subset="ADT",exclude=T,legend.pos = "right",ymax=150000000,title="191114_v2")
#> Using ID as id variables
#> Scale for 'y' is already present. Adding another scale for 'y', which will
#> replace the existing scale.

Here you can see that all the subsamples from our dataset show similar alignment quality. Around 50-70% of the reads were aligned and most of the reads survived filtering, which is an okay quality. The sequencing depth varied slightly, on average a number of reads in the range of 5.0e+07 was observed.

Composition

We can also check the composition of our transcripts:


### To find the biotypes of transcripts a Biomart object is required. Lets use the one from above
   genetypes_RNA<-gene_types(Seurat_objects$total,mart.use = mouse_ensembl)
#> Joining, by = "SYMBOL"
    
###draw_plot is also a function of cowplot, thus use ::
    genestype_plot<-scRNAseq.analysis::draw_plot(ggplot(genetypes_RNA,aes(x=TYPE,y=SUM)),xlab="",ylab="Sum over all cells",plot=geom_jitter(height = 0, width = 0.1),legend="none")+scale_y_log10()+coord_flip()
    
    genestype_plot

You can see that the majority of transcripts are protein coding genes. Other Transcript types may not be very informative, thus we leave them out for clustering and cluster annotation. Furthermore we’ll remove ribosomal genes, which are also less informative but highly expressed:

###lets find all genes annotated as rRNA in biomart 
ribo.genes<-biomaRt::getBM(attributes=c('external_gene_name', 'transcript_biotype',"chromosome_name"),
                           filters = 'biotype',
                           values = "rRNA",
                           mart = mouse_ensembl)

##not all of them are annotated, thus we select based on known terms

ribo.genes <-  c(grep(pattern =c("^Rpl|^Rps|^Mrps|^Mrpl"), x = rownames(Seurat_objects$total),
                    value = TRUE),ribo.genes$external_gene_name)

#keep protein coding only
Seurat_objects$filtered<-subset(Seurat_objects$total,features=genetypes_RNA[genetypes_RNA$TYPE=="protein_coding",]$SYMBOL)
#> Warning in subset.Assay(x = x[[assay]], cells = cells, features =
#> assay.features): NAs passed in the features vector, removing NAs

###remove ribosomal genes
filt_genes<-rownames(Seurat_objects$filtered)
filt_genes<-filt_genes[which(!filt_genes%in%ribo.genes)]
Seurat_objects$filtered<-subset(Seurat_objects$total,features=filt_genes)
   

Cells

Now lets include low quality cells. We start with removing empty wells:

###save our data in extra object, so we do not loose the previous state, when trying different subsetting criteria.
dataset<-Seurat_objects$filtered

#UMIs per cell####

  UMIs_cell(seurat_total = dataset,cutoff = 40)
#> Warning: Transformation introduced infinite values in continuous y-axis


dataset<-subset(dataset,subset = nCount_RNA>40)

These are all cells, in which the cell size (rank here) is not proportional to the #UMIs (all cells below red dotted line).

Another posibility to look at the quality of cells is the following plot:


dataset<-subset(dataset,subset=nFeature_RNA>370&nCount_RNA>500)
 umis_genes_color(dataset)###standard color is finding column in meta.data called "percent.mito" ... adjust accordingly

 
 ###if you are done save the subsetted dataset in our list
 Seurat_objects$postQC<-dataset

You should try to remove the tails in the histogram, so that we keep only high quality cells. Here we subsetted for at least 370 genes and 500 Umis. No cells with high percent mitochondrial were observed in the lower left quadrant and the mito content is generally low. Thus we do not need to subset for percent.mito.

Dimensionality Reduction and Clustering

We still have to exclude doublets and correct for ambient gene expression. But for this steps a first clustering result should be available. Thus we first start with dimensionality reduction and clustering. The scRNAseq analysis is split into 3 Parts:

Here we use a wrapper function for all these parts called seurat_flow().



###with Seurat >4.0 you need to make sure that Matrix <1.3.2 is installed otherwise following error,when executing FindNeighbors(): 
#Error in validObject(.Object) : 
  #invalid class “Graph” object: superclass "Mnumeric" not defined in the environment of the object's class
### use the following commands: 
# remotes::install_version("Matrix", version = "1.3.2", repos = "http://cran.us.r-project.org")

###umap-learn need to be installed beforehand, if that doesn't work just remove ,umap="uwot"
Seurat_objects$postQC<-seurat_flow(Seurat_objects$postQC,dim=1:21,sct=T,norm=F,umap="uwot") 

To determine the number of dimensions to use, check the elbow in ElbowPlot().

Clustering

We have performed clustering using the standard resolution of 0.7. Let’s see if we can make a more educated guess on the number of cluster available in our dataset.
To that purpose, two packages can be made use of. The first one is called clustree and it just shows us a graph like comparison between the different clustering resolutions. The resolution is selected at the point, when the clusters (nodes) start to mix up between resolutions (mixed edges).


###First remove previous clustering runs 
Seurat_objects$postQC@meta.data<-Seurat_objects$postQC[[]][,-grep("SCT_",names(Seurat_objects$postQC[[]]),fixed = T)]
###perform clustering at resolutions between 0.1 and two in 0.2 steps for example for both Genotypes


for(i in seq(0,1,0.1)){
Seurat_objects$postQC <- FindClusters(Seurat_objects$postQC, resolution = i)
}
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 1.0000
#> Number of communities: 1
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.9385
#> Number of communities: 5
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.9055
#> Number of communities: 7
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8843
#> Number of communities: 7
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8656
#> Number of communities: 8
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8493
#> Number of communities: 9
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8347
#> Number of communities: 11
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8210
#> Number of communities: 11
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8081
#> Number of communities: 12
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7975
#> Number of communities: 14
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5378
#> Number of edges: 218128
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7865
#> Number of communities: 15
#> Elapsed time: 0 seconds


#Clustree is used to select the proper resolution. In the plot, in.props means the number of
# cells in the edge / number of cells in node where edge is pointing to. If Clusters with a lower 
# resolution are sharing cell in a higher resolution, this resolution is not adequate. You need to set the prefix for the clusters at
# different resolutions in the metadata, the general naming is the following: <Assay>_<Clustering_method>_res.

###check the prefix to use in the metadata colnames 

###execute clustree 
clustree(Seurat_objects$postQC, prefix = "SCT_snn_res.")
#> Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
#> Please use the `.add` argument instead.

DimPlot(Seurat_objects$postQC)

###if only empty wells excluded
Idents(Seurat_objects$postQC)<-Seurat_objects$postQC[["SCT_snn_res.0.4"]]

You can see that there are singular mixes in low resolution and from 0.7 there appear to be slightly less stability. Since we still need to do some QC, lets not go into much detail and take for now resolution 0.4, where the clusters were stable for two resolutions. For doublet and ambient gene correction, we also need to check if the cluster roughly make sense. So let’s just quickly check the umap and marker genes.

library(dplyr)
dataset.markers<-list()

dataset.markers$postQC<-FindAllMarkers(Seurat_objects$postQC)
#> Calculating cluster 0
#> Calculating cluster 1
#> Calculating cluster 2
#> Calculating cluster 3
#> Calculating cluster 4
#> Calculating cluster 5
#> Calculating cluster 6
#> Calculating cluster 7

top_genes<-list()

###functions used in different packages as well, thus specify dplyr

top_genes$postQC<- dataset.markers$postQC %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 5, 
                                                              wt = avg_log2FC)


plot_grid(DoHeatmap(Seurat_objects$postQC, features = top_genes$postQC$gene)+NoLegend(),DimPlot(Seurat_objects$postQC,label=T)+NoLegend()+NoAxes())
#> Warning in DoHeatmap(Seurat_objects$postQC, features = top_genes$postQC$gene):
#> The following features were omitted as they were not found in the scale.data
#> slot for the SCT assay: Rack1, Csf1r

You can see, that the cluster are not mixed up and the marker genes make sense.

Correct Ambient Genes

Ambient genes are especially abundant in droplet or pico-well Sequencing technologies. Thus, it is advisable to perform correction for ambient genes for this dataset (SeqWell, pico-well).
To that purpose we can use the tool SoupX (constantAmateur/SoupX). Here a wrapper function is used … which extracts the count data from our unfiltered raw dataset and the QC post dataset, with preliminary clusters. The clustering beforehand is required for automatic SoupX correction.

#correct for ambient genes
par(mfrow = c(1,2), xpd=TRUE)
Seurat_objects$nosoup<-remove_ambient(Seurat_objects$postQC,Seurat_objects$filtered)
#> 182 genes passed tf-idf cut-off and 38 soup quantile filter.  Taking the top 38.
#> Using 80 independent estimates of rho.
#> Estimated global rho of 0.03
#> Warning in sparseMatrix(i = out@i[w] + 1, j = out@j[w] + 1, x = out@x[w], :
#> 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
#> Expanding counts from 8 clusters to 5378 cells.
mito.genes<-mito.genes[mito.genes%in%rownames(Seurat_objects$nosoup)]
Seurat_objects$nosoup<-MetaFeature(Seurat_objects$nosoup,mito.genes,"percent.mito")
plot_grid(umis_genes_color(Seurat_objects$postQC),umis_genes_color(Seurat_objects$nosoup), labels = c("Pre-correction","Post-correction"))

From this plot we can see, that the maximal estimated contamination fraction is 3%, which lays in the range of expected contamination in SeqWell experiments.


Seurat_objects$nosoup<-seurat_flow(Seurat_objects$nosoup,res=0.4,dim=1:21,umap="uwot")
#compare ambient gene corrected dataset and unGenotcorrected
##re-add percent.mito


plot_grid(umis_genes_color(Seurat_objects$postQC),umis_genes_color(Seurat_objects$nosoup), labels = c("Pre-correction","Post-correction"))

You can already see a huge difference in the QC metrics. It seems that cells with very high number of genes and UMIs mainly contained information about ambient genes. The correction, made also the cells more comparable, bringing them into a similar range. Let’s also check the clustering. Here the UMAPs:

dataset.markers$nosoup<-FindAllMarkers(Seurat_objects$nosoup, only.pos = TRUE)
#> Calculating cluster 0
#> Calculating cluster 1
#> Calculating cluster 2
#> Calculating cluster 3
#> Calculating cluster 4
#> Calculating cluster 5
#> Calculating cluster 6
#> Calculating cluster 7
#> Calculating cluster 8


top_genes$nosoup<- dataset.markers$nosoup %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 5, 
                                                               wt = avg_log2FC)

plot_grid(DimPlot(Seurat_objects$postQC,label=T)+NoLegend()+NoAxes(),DimPlot(Seurat_objects$nosoup,label=T)+NoLegend()+NoAxes(), labels = c("Pre-correction","Post-correction"))

You can see that we find more clusters with the same resolution and the clusters found before separate also better after correction. Lets check The marker genes as well:


plot_grid(DoHeatmap(Seurat_objects$postQC, features = top_genes$postQC$gene)+NoLegend(),DoHeatmap(Seurat_objects$nosoup, features = top_genes$nosoup$gene)+NoLegend(), labels = c("Pre-correction","Post-correction"))
#> Warning in DoHeatmap(Seurat_objects$postQC, features = top_genes$postQC$gene):
#> The following features were omitted as they were not found in the scale.data
#> slot for the SCT assay: Rack1, Csf1r
#> Warning in DoHeatmap(Seurat_objects$nosoup, features = top_genes$nosoup$gene):
#> The following features were omitted as they were not found in the scale.data
#> slot for the SCT assay: Rack1, Csf1r, Cx3cr1

The marker genes are similar, although after correction Nfkbia and Mrc1 appear, which tell us more about the state (Nfkb=Inflammation) and identity (Mrc1 = Macrophage) of certain populations.

Doublet removal

It is recommended to perform doublet detection separately on each of the present conditions. In our case, we perform the detection on the Genotypes WT and KO. Prior to run the wrapper function for DoubletFinder, the QC for the separate Genotypes might be adjusted slightly and uninformative clusters removed. I will not go here into detail, because the major steps were already explained above:

Seurat_objects[c("WT","KO")]<-NULL
Genotypes<-SplitObject(Seurat_objects$nosoup,split.by = "Genotype")
Seurat_objects<-c(Seurat_objects,Genotypes)

###We needed Genotypes only as a helper variable, so lets remove it again
rm(Genotypes)

##rerrun seurat for soup corrected genotypes
###do not use umap-learn when performing seurat_flow in parallel, otherwise error: 
#  Terminating: fork() called from a process already using GNU OpenMP, this is unsafe.
Seurat_objects[c("WT","KO")]<- llply(.data = Seurat_objects[c("WT","KO")], .fun =seurat_flow,res=.6,dim=1:21,sct=T,alg=1,.parallel = T,.paropts = list(.packages = "Seurat"))

###analyse WT first
for(i in seq(0.1,1.0,0.1)){
Seurat_objects$WT<- FindClusters(Seurat_objects$WT, resolution = i)
}

# clustree(Seurat_objects$WT, prefix = "SCT_snn_res.")

Idents(Seurat_objects$WT)<-Seurat_objects$WT[["SCT_snn_res.0.6"]]

# VlnPlot(Seurat_objects$WT,features = "nCount_RNA")
Seurat_objects$WT<-subset(Seurat_objects$WT,subset=nCount_RNA>580)


# VlnPlot(Seurat_objects$WT,features = "percent.mito")

###analyse KO

for(i in seq(1.0,2.0,0.1)){
Seurat_objects$KO<- FindClusters(Seurat_objects$KO, resolution = i)
}

# Seurat_objects$KO@meta.data<-Seurat_objects$KO[[]][,-grep("SCT_",colnames(Seurat_objects$KO[[]]),fixed = T)]

# clustree(Seurat_objects$KO, prefix = "SCT_snn_res.")

Idents(Seurat_objects$KO)<-Seurat_objects$KO[["SCT_snn_res.1.3"]]

# plot_grid(VlnPlot(Seurat_objects$KO,features = "nCount_RNA"),VlnPlot(Seurat_objects$KO,features = "percent.mito"))

Seurat_objects$KO<-subset(Seurat_objects$KO,subset=nCount_RNA>580)


  dataset.markers[c("WT","KO")]<-llply(Seurat_objects[c("WT","KO")],FindAllMarkers, only.pos = TRUE,.parallel=T,.paropts = list(.packages="Seurat"))
    
    for(i in c("WT","KO")){
    top_genes[[i]]<- dataset.markers[[i]] %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 5, 
                                                                   wt = avg_log2FC)
    }

  plot_grid(plot_grid(DoHeatmap(Seurat_objects$KO,top_genes$KO$gene)+NoLegend(),
DimPlot(Seurat_objects$KO)+NoAxes()),plot_grid(DoHeatmap(Seurat_objects$WT,top_genes$WT$gene)+NoLegend(),
DimPlot(Seurat_objects$WT)+NoAxes()))

So now that we now that our clusters make sense, we can use DoubletFinder.

Seurat_objects$WT<-doublets(Seurat_objects$WT,dim = 1:21)


Seurat_objects$KO<-doublets(Seurat_objects$KO,dim = 1:21)


###the meta col name for doublets might vary slightly, but starts always with "DF.classifications" 

name_doublet_wt<-grep("^DF.classifications_",names(Seurat_objects$WT[[]]),value = T)
name_doublet_ko<-grep("^DF.classifications_",names(Seurat_objects$KO[[]]),value = T)

###visualise doublets

plot_grid(plot_grid(DimPlot(Seurat_objects$WT,group.by = name_doublet_wt)+labs(title=""),DimPlot(Seurat_objects$WT)),
plot_grid(DimPlot(Seurat_objects$KO,group.by = name_doublet_ko)+labs(title=""),DimPlot(Seurat_objects$KO)),labels = c("WT","KO"))



###lets remove the doublets
###since we do not know the exact column way, we need to use Fetch data as a workaround

expr <- FetchData(Seurat_objects$WT, vars = name_doublet_wt)
Seurat_objects$WT <- Seurat_objects$WT[, which(expr == "Singlet")]

expr <- FetchData(Seurat_objects$KO, vars = name_doublet_ko)
Seurat_objects$KO <- Seurat_objects$KO[, which(expr == "Singlet")]

The number of doublets is low in this datasets, let’s remove the ones which were found.
Lets now merge again the corrected Genotypes together:

###The Genotypes were merged after ambient gene and Doublet removal
###Error: Error in UseMethod(generic = "DefaultAssay", object = object) : 
  # no applicable method for 'DefaultAssay' applied to an object of class "NULL" 
#Thus genotypes are saved intermediately
WT<-Seurat_objects$WT
KO<-Seurat_objects$KO
Seurat_objects$processed<-merge(WT,KO)

###remove the intermediate variables again

rm(WT,KO)

Downstream analysis

###WT should always appear at the left side
 Seurat_objects$processed@meta.data$Genotype<-relevel(as.factor(Seurat_objects$processed@meta.data$Genotype),"WT")

###umap-learn needs to be installed... if error just remove umap="umap-learn"
Seurat_objects$processed<-seurat_flow(Seurat_objects$processed,res=0.5,dim = 1:21,umap="uwot",only.var = F)

Optimal Clusters



plot_grid(DimPlot(Seurat_objects$processed,reduction="umap",group.by = "Genotype")+NoAxes()+NoLegend(),
DimPlot(Seurat_objects$processed,reduction="umap",group.by = "Mouse")+NoAxes()+NoLegend(),
DimPlot(Seurat_objects$processed,reduction="umap",group.by = "Pool")+NoAxes()+NoLegend())

###The cells do not separate according to the genotypes ... thus batch correction or integration not required, Samples seems also to be equally distributed 

names(Seurat_objects$processed[[]])
#>  [1] "orig.ident"                      "nCount_RNA"                     
#>  [3] "nFeature_RNA"                    "zeros"                          
#>  [5] "Genotype"                        "Mouse"                          
#>  [7] "Pool"                            "percent.mito"                   
#>  [9] "nCount_SCT"                      "nFeature_SCT"                   
#> [11] "SCT_snn_res.0.4"                 "seurat_clusters"                
#> [13] "SCT_snn_res.0.6"                 "SCT_snn_res.0.1"                
#> [15] "SCT_snn_res.0.2"                 "SCT_snn_res.0.3"                
#> [17] "SCT_snn_res.0.5"                 "SCT_snn_res.0.7"                
#> [19] "SCT_snn_res.0.8"                 "SCT_snn_res.0.9"                
#> [21] "SCT_snn_res.1"                   "pANN_0.25_0.03_33"              
#> [23] "DF.classifications_0.25_0.03_33" "SCT_snn_res.1.1"                
#> [25] "SCT_snn_res.1.2"                 "SCT_snn_res.1.3"                
#> [27] "SCT_snn_res.1.4"                 "SCT_snn_res.1.5"                
#> [29] "SCT_snn_res.1.6"                 "SCT_snn_res.1.7"                
#> [31] "SCT_snn_res.1.8"                 "SCT_snn_res.1.9"                
#> [33] "SCT_snn_res.2"                   "pANN_0.25_0.03_31"              
#> [35] "DF.classifications_0.25_0.03_31"
#remove all metadata,which is not required anymore
Seurat_objects$processed@meta.data<-Seurat_objects$processed[[]][,-grep("SCT_",names(Seurat_objects$processed[[]]),fixed = T)]
# Seurat_objects$processed@meta.data<-Seurat_objects$processed[[]][,-grep("DF.classifications",names(Seurat_objects$processed[[]]),fixed = T)]
# Seurat_objects$processed@meta.data<-Seurat_objects$processed[[]][,-grep("pANN_",names(Seurat_objects$processed[[]]),fixed = T)]


###find optimal #cluster using clustree
for(i in seq(0.1,1.2,0.1)){
Seurat_objects$processed<- FindClusters(Seurat_objects$processed, resolution = i)
}
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.9440
#> Number of communities: 6
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.9167
#> Number of communities: 7
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8978
#> Number of communities: 10
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8817
#> Number of communities: 10
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8655
#> Number of communities: 10
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8513
#> Number of communities: 11
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8389
#> Number of communities: 13
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8273
#> Number of communities: 14
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8170
#> Number of communities: 15
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8077
#> Number of communities: 15
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7983
#> Number of communities: 16
#> Elapsed time: 0 seconds
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 5308
#> Number of edges: 217614
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7897
#> Number of communities: 16
#> Elapsed time: 0 seconds

  clustree(Seurat_objects$processed, prefix = "SCT_snn_res.")

Now that we have are completely QC corrected dataset, Let’s use the Nbclust tool to determine the optimal number of clusters. Nbclust performs a validation for the optimal number of cluster, using 33 Tools. The function optimal_n_cluster is a wrapper. It first executes NBclust with all available tools in parallel and then saves the results in class “nbclust”:


optimal_clusters<-optimal_n_cluster(Seurat_objects$processed,min.nc = 10,max.nc = 25)
#> Warning: Removed 2 row(s) containing missing values (geom_path).
#> Warning: Removed 2 rows containing missing values (geom_point).

optimal_clusters@top3
#> [1] "22" "10" "12"


DimPlot(Seurat_objects$processed)

One could sometimes also find plots in , here it did not work. But the top 3 number of clusters from all the tools ,were 10,14,12 clusters. Together with the results of nbclust, 10 clusters seems to be optimal. Upon visual inspection of the UMAP, further subgroups in the clusters 3,4,5,1 and 9 were found, which we were not able to separate, when considering all cells. For these cells we can perform subclustering:

Subclustering

Lets first remove T cells from analysis:

###set best resolution
cluster_ids<-list()
Idents(Seurat_objects$processed)<-Seurat_objects$processed[["SCT_snn_res.0.5"]]

  dataset.markers$processed<-FindAllMarkers(Seurat_objects$processed)
    

    top_genes$processed<- dataset.markers$processed %>% group_by(cluster) %>% top_n(n = 5, 
                                                                   wt = avg_log2FC)
    
    DoHeatmap(Seurat_objects$processed,top_genes$processed$gene,group.colors = my_cols2)+NoLegend()

    
     ###remove Cd8+ cells
   Seurat_objects$processed<-subset(Seurat_objects$processed,idents=seq(0,9,1)[-8])
###need to rerun seurat flow after subsetting
Seurat_objects$processed<-seurat_flow(Seurat_objects$processed,res=0.5,dim = 1:21,umap="uwot",only.var = F)
   ####marker genes

### Visual inspection showed possible subclusters for cluster 3,4,5 ... thus performed subclustering
   Seurat_objects$processed<-FindSubCluster(Seurat_objects$processed,cluster = "3",graph.name = "SCT_snn",res=0.25)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 610
#> Number of edges: 19276
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7735
#> Number of communities: 3
#> Elapsed time: 0 seconds
   Idents(Seurat_objects$processed)<-Seurat_objects$processed$sub.cluster
     Seurat_objects$processed<-FindSubCluster(Seurat_objects$processed,cluster = "4",graph.name = "SCT_snn",resolution =0.15)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 405
#> Number of edges: 14031
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8659
#> Number of communities: 2
#> Elapsed time: 0 seconds
   Idents(Seurat_objects$processed)<-Seurat_objects$processed$sub.cluster
      Seurat_objects$processed<-FindSubCluster(Seurat_objects$processed,cluster = "2",graph.name = "SCT_snn",resolution = 0.35)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 701
#> Number of edges: 24099
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7425
#> Number of communities: 3
#> Elapsed time: 0 seconds
   Idents(Seurat_objects$processed)<-Seurat_objects$processed$sub.cluster

 
 
DimPlot(Seurat_objects$processed,label=T)

Now you can see, that all cluster capture one homogenous cell group in the UMAP (at least visually). Now we can identify the clusters:

  
  
   dataset.markers$processed<-FindAllMarkers(Seurat_objects$processed)
#> Calculating cluster 1
#> Calculating cluster 8
#> Calculating cluster 3_0
#> Calculating cluster 2_1
#> Calculating cluster 2_0
#> Calculating cluster 0
#> Calculating cluster 5
#> Calculating cluster 2_2
#> Calculating cluster 4_0
#> Calculating cluster 3_1
#> Calculating cluster 7
#> Calculating cluster 9
#> Calculating cluster 3_2
#> Calculating cluster 10
#> Calculating cluster 4_1
#> Calculating cluster 6
    

    top_genes$processed<- dataset.markers$processed %>% group_by(cluster) %>% top_n(n = 5, 
                                                                   wt = avg_log2FC)


   

    ####rename cluster ####
cluster_ids$processed <- c("ss_MG","early_NP","int_homeo_MG","homeo_DAM","inflam_DAM","int_MG","int_NP","type_1_MF","CAM","homeo_MG","inflam_MG","inflam_NP","SC","MC","IR_MF","IR_DAM")


###asign the clusters to be changed, as names to the new cluster identities
names(cluster_ids$processed) <- levels(Idents(Seurat_objects$processed))

### renamve the clusters based on this named vector 
Seurat_objects$processed <- RenameIdents(Seurat_objects$processed, cluster_ids$processed)
#check if the cluster IDs were changed successfully 
levels(Idents(Seurat_objects$processed))
#>  [1] "ss_MG"        "early_NP"     "int_homeo_MG" "homeo_DAM"    "inflam_DAM"  
#>  [6] "int_MG"       "int_NP"       "type_1_MF"    "CAM"          "homeo_MG"    
#> [11] "inflam_MG"    "inflam_NP"    "SC"           "MC"           "IR_MF"       
#> [16] "IR_DAM"
    
##don't forget to rename the color palette for the new cluster names 

names(my_cols)<-levels(Idents(Seurat_objects$processed))

my_cols2 <- my_cols[order(as.integer(names(my_cols)))]
#> Warning in eval(quote(list(...)), env): NAs introduced by coercion




   plot_grid(DoHeatmap(Seurat_objects$processed,top_genes$processed$gene,group.colors = my_cols2)+NoLegend(),DimPlot(Seurat_objects$processed,label = T,cols = my_cols2)+NoAxes()+NoLegend())

Cluster abundances

Since we know the identities of our clusters, we can check the differences between the genotypes, lets first create a Barplot showing the cluster abundances:


####Cluster cell numbers####

###extract frequencies of each seurat clusters splitted into genotypes as table from Seurat object total
table_cluster<-table(Seurat_objects$processed@active.ident,Seurat_objects$processed$Genotype)
table_frame<- as.data.frame(table_cluster)
colnames(table_frame)<-c("Seurat_cluster","Genotype","Freq")
table_frame$Genotype<-relevel(table_frame$Genotype,"WT")

####required for the total cell number labels in the plot later
table_geno<-table(Seurat_objects$processed$Genotype)
table_geno<- as.data.frame(table_geno)
table_geno$Seurat_cluster<-"act. MG"
colnames(table_geno)<-c("Genotype","Freq","Seurat_cluster")
table_geno$Genotype<-relevel(table_geno$Genotype,"WT")


#####relative cell numbers 


##compare Genotype 

###total cell numbers
props_total_t<-table_geno%>%dplyr::group_by(Genotype)%>%dplyr::mutate(total=sum(Freq),prop=sum(Freq/sum(Freq)))

### cell percentage for each cluster

props_total_g<-table_frame%>%dplyr::group_by(Genotype)%>%dplyr::mutate(prop=Freq / sum(Freq))
props_total_g<-as.data.frame(props_total_g)


plot_list_g<-list()
 plot_list_g<-sapply(colnames(props_total_g),gen_plots,data=props_total_g,list=plot_list_g,xvalue="Genotype",yvalue="prop")
 # names(plot_list_g)<-colnames(props_total_g)
 ##we need only the plot for the clusters splitted into genotypes
 plot_list_g<-plot_list_g[c(1)]
 
 graph_list_g<-list()
 ###you need to have the same number colors for the clusters otherwise NA will be shown 
 my_cols2<-my_cols2[-c(17,18,19)]
 graph_list_g<-lapply(plot_list_g,scRNAseq.analysis::draw_plot,ylab="rel. Abundance",xlab="",plot=geom_col(),cluster_colors = my_cols2)
###plot with percentages and absolute cell numbers across genotypes and cell clusters 
cluster_abs<-graph_list_g$Seurat_cluster.col+geom_text(aes(label =paste(Freq,"cells",sep=" ")),  position = position_stack(vjust = 0.5),size=1.5)+geom_text(data=props_total_t,aes(label =total,y=prop,x=Genotype),   vjust = -.25,size=3)
# 
# saveRDS(cluster_abs,here::here("figures","cluster_abs.RDS"))
plot_grid(DimPlot(Seurat_objects$processed,group.by = "Genotype")+theme(title = element_text(size=9)),DimPlot(Seurat_objects$processed,cols = my_cols2,label=T)+NoLegend(),cluster_abs)

DE Genes & GOEA

First we need to extract all genes in the dataset and translate them into EntrezID, this genes will than be used as background for significance testing in the GSEA function:


##check if non protein coding genes give additional information
# Seurat_objects$safe<-Seurat_objects$processed
# Seurat_objects$full<-subset(Seurat_objects$total,cells=colnames(Seurat_objects$processed))
# Seurat_objects$full@reductions$prot_only<-  Seurat::CreateDimReducObject(Seurat_objects[["processed"]]@reductions[["umap"]]@cell.embeddings)
# Idents(Seurat_objects$full)<-Idents(Seurat_objects$processed)
# DimPlot(Seurat_objects$full,reduction = "prot_only")
# Seurat_objects$processed<-Seurat_objects$full

###no real plus, so kept the protein coding only
###need to execute after every restart, otherwise Error: external pointer is not valid
OrgDb = org.Mm.eg.db


###prepare background genes
###select all genes in dataset ... required for stastitical testing ... see "universe" in enricher()
#TODO: Try using only the genes found in the cluster as background, at least for comparing the genotypes
   background <- as.matrix(GetAssayData(object = Seurat_objects$processed, slot = 'counts'))
  background <- rownames(background[apply(background, 1, function (x) {sum(x >= 1)})>3,])
  background <- bitr(background, fromType = "SYMBOL", toType="ENTREZID", OrgDb=OrgDb)
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in bitr(background, fromType = "SYMBOL", toType = "ENTREZID", OrgDb =
#> OrgDb): 0.37% of input gene IDs are fail to map...
  
  
   ### Get human homologues for mouse genes 
    background_hsa <- getLDS(attributes = c("entrezgene_id"), 
                                                       filters = "entrezgene_id", 
                                                       values = background$ENTREZID, 
                                                       mart = mouse_ensembl, 
                                                       attributesL = c("entrezgene_id"), 
                                                       martL =human_ensembl, 
                                                       uniqueRows=T)
    
     
    colnames(background_hsa)<-c("entrez_m","entrez_h")
  background_hsa$entrez_m<-as.character(background_hsa$entrez_m)
   background_hsa$entrez_h<-as.character(background_hsa$entrez_h)
  
  background<-left_join(background,background_hsa,by=c("ENTREZID"="entrez_m"))
  ##background_hsa was just a helper variable, so lets remove it
rm(background_hsa)

In single cell experiments, there are multiple possibilites, which genes to take for GSEA. First, the marker genes of the single cell clusters can be used to confirm our annotation. Seurat::FindMarkers() will be used on the condition variable, which is in our case the cell clusters, total is set to TRUE, so that the Idents are only considered once.
There are 5 available databases “GO”,“KEGG”,“DO”,“Hallmark”,“cannonicalPathways”,“Motifs”,“ImmunoSignatures”. Lets take all of them and perform GSEA in parallel using foreach :

###perform scRNAseq.analysis::GSEA for all available databases, the results will be saved in list called ontology


###scRNAseq.analysis::GSEA no differential expression between genotypes

###perform GSEA for all available databases, the results will be saved in list called ontology
ontology<-list()


ontology_db<-c("GO","KEGG","DO","Hallmark","cannonicalPathways","Motifs","ImmunoSignatures")

Seurat_objects$processed$seurat_clusters<-Idents(Seurat_objects$processed)
ontology[["nogeno"]]<-NULL
ontology[["nogeno"]]<-foreach(i=ontology_db,.errorhandling = "remove",.combine=c,.multicombine = T) %dopar% {
R.utils::withTimeout({scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "seurat_clusters",top = 50,GeneSets = i,total = T,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)},
                    timeout = 180,cpu=F,onTimeout = "silent")

}
###Here Seurat_objects$processed is my final seurat object 
###sometimes GO doesn't work in parallel, just run scRNAseq.analysis::GSEA in one core mode and it should be fine
ontology$nogeno["GO"]<-scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "seurat_clusters",top = 50,GeneSets = "GO",total = T,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)
#> Calculating cluster ss_MG
#> Calculating cluster early_NP
#> Calculating cluster int_homeo_MG
#> Calculating cluster homeo_DAM
#> Calculating cluster inflam_DAM
#> Calculating cluster int_MG
#> Calculating cluster int_NP
#> Calculating cluster type_1_MF
#> Calculating cluster CAM
#> Calculating cluster homeo_MG
#> Calculating cluster inflam_MG
#> Calculating cluster inflam_NP
#> Calculating cluster SC
#> Calculating cluster MC
#> Calculating cluster IR_MF
#> Calculating cluster IR_DAM
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in clusterProfiler::bitr(top$gene, fromType = "SYMBOL", toType =
#> "ENTREZID", : 1.62% of input gene IDs are fail to map...

Furthermore, DE genes between Genotypes can be used for GSEA. Here our condition is the Genotype meta data variable, and total=T needs to be set, so that the single cell clusters are not considered in the analysis:




### make scRNAseq.analysis::GSEA for the total cells (all clusters together) comparing WT and KO, if nothing found will be ignored (.errorhandling=remove) ... if you want to see the errormessage set .errorhandling="pass"

ontology[["total"]]<-NULL
ontology[["total"]]<-foreach(i=ontology_db,.errorhandling="remove",.combine=c,.multicombine = T) %dopar% {
  R.utils::withTimeout({scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "Genotype",top = 50,GeneSets = i,total = T,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)},
                    timeout = 180,cpu=F,onTimeout = "silent")

    
}

Finally, the DE Genes between Genotypes in each clustered can be used for GSEA, by setting total=F:

###Then scRNAseq.analysis::GSEA for each cluster individually
ontology[["cluster"]]<-NULL
ontology[["cluster"]]<-foreach(i=ontology_db,.errorhandling = "remove",.combine=c,.multicombine = T) %dopar% {
 R.utils::withTimeout({scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "Genotype",top = 50,GeneSets = i,total = F,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)},
                    timeout = 180,cpu=F,onTimeout = "silent")
  
}

###Here Seurat_objects$processed is my final seurat object 
###sometimes GO doesn't work in parallel, just run scRNAseq.analysis::GSEA in one core mode and it should be fine
ontology$cluster["GO"]<-scRNAseq.analysis::GSEA(object = Seurat_objects$processed,condition = "Genotype",top = 50,GeneSets = "GO",total = F,OrgDb = OrgDb,human_ensembl = human_ensembl,mouse_ensembl = mouse_ensembl)
#> [1] "ss_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "early_NP"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "int_homeo_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "homeo_DAM"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "inflam_DAM"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "int_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "int_NP"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "type_1_MF"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "CAM"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "homeo_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "inflam_MG"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "inflam_NP"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "SC"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "MC"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "IR_MF"
#> Calculating cluster WT
#> Calculating cluster KO
#> [1] "IR_DAM"
#> Calculating cluster WT
#> Calculating cluster KO
#> 'select()' returned 1:1 mapping between keys and columns
#> Warning in clusterProfiler::bitr(top$gene, fromType = "SYMBOL", toType =
#> "ENTREZID", : 1.24% of input gene IDs are fail to map...

Lets only put WT on the left side of the plot:


###relevel the genotype, so that WT appears on left side 
for(i in ontology_db){
ontology[["total"]][[i]]@compareClusterResult[["cluster"]]<-as.factor(ontology[["total"]][[i]]@compareClusterResult[["cluster"]])
ontology[["total"]][[i]]@compareClusterResult[["cluster"]]<-relevel(ontology[["total"]][[i]]@compareClusterResult[["cluster"]],"WT")
}

for(i in ontology_db){
ontology[["cluster"]][[i]]@compareClusterResult[["Condition"]]<-as.factor(ontology[["cluster"]][[i]]@compareClusterResult[["Condition"]])
ontology[["cluster"]][[i]]@compareClusterResult[["Condition"]]<-relevel(ontology[["cluster"]][[i]]@compareClusterResult[["Condition"]],"WT")
}

We created now only GSEA objects. To visualise the results dotplot can be used, here we can create a list of ggplots, so that we don’t need to run ggplot for each database:

### Create dotplots 
###create dotplots for all the ontologies created above 
ontology_plots<-list()

ontology_plots[["total"]]<-foreach(i=names(ontology$total),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
  
  ontology_plots[[i]]<-clusterProfiler::dotplot(ontology$total[[i]],x=~cluster,show=5)+theme(axis.text.x = element_text(angle=60,vjust = 1, hjust=1,size=3),axis.text.y = element_text(size=3),title=element_text(size=5),legend.key.size = unit(0.2,"line"),legend.text = element_text(size=3))+ggtitle(i)+scale_size(range = c(0.1,1))
  
    return(ontology_plots)
  }

###create dot plots in separate plots for WT and KO 
plots<-list()
ontology_plots[["cluster"]]<-NULL
ontology_plots[["cluster"]]<-foreach(i=names(ontology$cluster),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
  
 plots[[i]]<-dotplot(ontology$cluster[[i]],x=~cluster,show=5)+facet_wrap(~Condition)+theme(axis.text.x = element_text(angle=60,vjust = 1, hjust=1,size=3),strip.text = element_text(size=4,margin = margin(0.5,0,0.5,0, "mm")),axis.text.y = element_text(size=3),title=element_text(size=5),legend.key.size = unit(0.2,"line"),legend.text = element_text(size=3))+ggtitle(i)+scale_size(range = c(0.1,1))
  
  return(plots)
}

####create dotplot without comparing genotypes for each cluster


plots<-list()
ontology_plots[["nogeno"]]<-NULL
ontology_plots[["nogeno"]]<-foreach(i=names(ontology$nogeno),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
  
 plots[[i]]<-dotplot(ontology$nogeno[[i]],x=~cluster,show=5)+theme(axis.text.x = element_text(angle=60,vjust = 1, hjust=1,size=3),axis.text.y = element_text(size=3),title=element_text(size=5),legend.key.size = unit(0.2,"line"),legend.text = element_text(size=3))+ggtitle(i)+scale_size(range = c(0.1,1))
  
  return(plots)
}

# saveRDS(ontology_plots,here::here("RImages/SeqWell_myeolid/Exp_191114","ontology_plots.RDS"))

Using plot_grid, multiple results dotplots can be visualised at once. For each type of marker genes we used, an entry in the ontology_plots list was created. The next level in the list are the single databases, they can be accessed by name or position:

###plot all in one: probably too much, thus need to subset: here only plot 7 will be plotted
levels(Idents(Seurat_objects$processed))
#>  [1] "ss_MG"        "early_NP"     "int_homeo_MG" "homeo_DAM"    "inflam_DAM"  
#>  [6] "int_MG"       "int_NP"       "type_1_MF"    "CAM"          "homeo_MG"    
#> [11] "inflam_MG"    "inflam_NP"    "SC"           "MC"           "IR_MF"       
#> [16] "IR_DAM"
plot_grid(plotlist = ontology_plots$nogeno["GO"])

plot_grid(plotlist = ontology_plots$total[c("GO","DO","KEGG","Motifs")])

plot_grid(plotlist = ontology_plots$cluster[c(1:3)])

In the same way, a network visualisation can be used:

###for visualisation as a network package enrichplot required
library(enrichplot)


###same as above but visualisation as emaplot(network)

emaplots<-list()

emaplots[["total"]]<-foreach(i=names(ontology$total),.packages = c("clusterProfiler"),.combine=c,.multicombine=T,.errorhandling = "remove")%dopar%{
  
  emaplots[[i]]<-emapplot(pairwise_termsim(ontology$total[[i]]),showCategory = 15)
  
  return(emaplots)
}


emaplots[["cluster"]]<-NULL
plots<-list()
emaplots[["cluster"]]<-foreach(i=names(ontology$cluster),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
  
  plots[[i]]<-emapplot(pairwise_termsim(ontology$cluster[[i]]),showCategory = 15)
  
  return(plots)
}

plots<-list()
emaplots[["nogeno"]]<-NULL
emaplots[["nogeno"]]<-foreach(i=names(ontology$nogeno),.packages = c("clusterProfiler"),.combine=c,.multicombine=T)%dopar%{
  
  plots[[i]]<-emapplot(pairwise_termsim(ontology$nogeno[[i]]),showCategory = 15)
  return(plots)
  
}

plot_grid(plotlist = emaplots$nogeno["GO"])

Volcano Plots

####Volcano Plots ####

tmp<-Seurat_objects$processed

dataset.markers$cluster<-list()
for (i in levels(Idents(tmp))){
  print(i)
  tmp2<-subset(tmp,idents=i)
  Idents(object = tmp2) <- tmp2[[]][,"Genotype"]
  
  
  
  dataset.markers$cluster[[i]] <-FindMarkers(object = tmp2,ident.1 = "KO",ident.2 = "WT",logfc.threshold = 0.1)
  # colnames(dataset.markers$cluster[[i]])[6]<-"Condition"
  dataset.markers$cluster[[i]]$cluster<-as.factor(i)

}
#> [1] "ss_MG"
#> [1] "early_NP"
#> [1] "int_homeo_MG"
#> [1] "homeo_DAM"
#> [1] "inflam_DAM"
#> [1] "int_MG"
#> [1] "int_NP"
#> [1] "type_1_MF"
#> [1] "CAM"
#> [1] "homeo_MG"
#> [1] "inflam_MG"
#> [1] "inflam_NP"
#> [1] "SC"
#> [1] "MC"
#> [1] "IR_MF"
#> [1] "IR_DAM"

Idents(tmp)<-tmp$Genotype
 dataset.markers$geno_total<-FindMarkers(tmp,ident.1 = "KO",ident.2 = "WT",logfc.threshold = 0.1)
 
 rm(tmp,tmp2)
 highlight<-c("Camk1d","Cmss1","Tmem119","Apc","Ctsb","Tmsb4x","S100a9","Fkbp5","Apoe","Anapc15","Ctsb","Junb","Psap","Clec7a","Lgals3bp","Tmem9","Cdk17")
 
# dataset.markers$cluster<-do.call(rbind,dataset.markers$cluster)
#  dataset.markers$cluster %>% group_by(cluster,Condition) %>% top_n(n = 50, wt = avg_log2FC) -> top
Volcano_plots<-list()

###create list of volcano plots 

Volcano_plots$total<-Volcano_plot(dataset.markers$geno_total,FC_cutoff=0.25,p_cutoff=1e-30,legend="none",titles = "Total",lab.size = 2,title.size = 6)+xlab(NULL)+ylab(NULL)+theme(axis.ticks = element_line(size=.3),axis.line =  element_line(size=.3))
#> Registered S3 methods overwritten by 'ggalt':
#>   method                  from   
#>   grid.draw.absoluteGrob  ggplot2
#>   grobHeight.absoluteGrob ggplot2
#>   grobWidth.absoluteGrob  ggplot2
#>   grobX.absoluteGrob      ggplot2
#>   grobY.absoluteGrob      ggplot2

for(i in names(dataset.markers$cluster)){
    Volcano_plots[[i]]<-Volcano_plot(dataset.markers$cluster[[i]],do.col=F,Condition="cluster",FC_cutoff=0.25,p_cutoff=5e-02,legend="none",lab.size=2,title.size=6)+xlab(NULL)+ylab(NULL)+theme(axis.ticks = element_line(size=.3),axis.line =  element_line(size=.3))

}


Volcano_plots<-Volcano_plots[!(names(Volcano_plots)%in%c("inflam_DAM","inflam_NP","MC","IR_MF","int_homeo_MG","homeo_DAM","type_1_MF"))]



#add global x and y axes

y.grob <- textGrob("-Log10(p)", 
                   gp=gpar(fontface="bold", col="black", fontsize=10), rot=90)

x.grob <- textGrob("Log2(FC)", 
                   gp=gpar(fontface="bold", col="black", fontsize=10))

allVolcano<-plot_grid(plotlist = Volcano_plots)+geom_segment(aes(x = 0.01, y = -0.01, xend = 0.01, yend = 0.95),
                  arrow = arrow(length = unit(0.2, "cm")))+geom_segment(aes(x = -0.01, y = 0, xend = 0.95, yend = 0),
                  arrow = arrow(length = unit(0.2, "cm")))+theme(plot.margin=unit(c(7,7,7,7),"mm"))




grid.arrange(arrangeGrob(allVolcano,
                 left = y.grob, bottom = x.grob))




processed<-Seurat_objects$processed


  # saveRDS(Volcano_plots,here::here("RImages/SeqWell_myeolid/Exp_191114","Volcano_plots.RDS"))